Dataset: Loan Status Prediction
https://www.kaggle.com/datasets/bhavikjikadara/loan-status-prediction/data
Team Members: Ankit Malhotra, Nathaniel Zhu
versioninfo()
Julia Version 1.10.0 Commit 3120989f39b (2023-12-25 18:01 UTC) Build Info: Official https://julialang.org/ release Platform Info: OS: macOS (arm64-apple-darwin22.4.0) CPU: 12 × Apple M2 Pro WORD_SIZE: 64 LIBM: libopenlibm LLVM: libLLVM-15.0.7 (ORCJIT, apple-m1) Threads: 1 on 8 virtual cores Environment: JULIA_NUM_THREADS =
# Activate the default shared environment (~/.julia/environments/v1.10)
# and list the installed packages for reproducibility.
import Pkg
Pkg.activate()
Pkg.status()
Activating project at `~/.julia/environments/v1.10`
Status `~/.julia/environments/v1.10/Project.toml` [cbdf2221] AlgebraOfGraphics v0.6.18 [024491cd] BetaML v0.11.4 ⌃ [336ed68f] CSV v0.10.12 [13f3f980] CairoMakie v0.11.9 [e2e10f9a] CatBoost v0.3.4 [324d7699] CategoricalArrays v0.10.8 [aaaa29a8] Clustering v0.15.7 [8f4d0f93] Conda v1.10.0 [a93c6f00] DataFrames v1.6.1 ⌃ [1313f7d8] DataFramesMeta v0.14.1 [b4f34e82] Distances v0.10.11 [31c24e10] Distributions v0.25.107 ⌃ [5789e2e9] FileIO v1.16.2 ⌃ [587475ba] Flux v0.14.12 [da1fdf0e] FreqTables v0.4.6 [38e38edf] GLM v1.9.0 [8d5ece8b] GLMNet v0.7.2 [e9467ef8] GLMakie v0.9.9 [09f84164] HypothesisTests v0.11.0 [7073ff75] IJulia v1.24.2 [4e3cecfd] ImageShow v0.3.8 [f0e99cf1] MLBase v0.9.2 [eb30cadb] MLDatasets v0.7.14 ⌃ [add582a8] MLJ v0.20.2 [d354fa79] MLJClusteringInterface v0.1.11 [c6f25543] MLJDecisionTreeInterface v0.4.1 [094fc8d1] MLJFlux v0.4.0 [caf8df21] MLJGLMInterface v0.3.7 [61c7150f] MLJLIBSVMInterface v0.2.1 [6ee0df7b] MLJLinearModels v0.10.0 ⌃ [d491faf4] MLJModels v0.16.15 [1b6a4a23] MLJMultivariateStatsInterface v0.5.3 [33e4bacb] MLJNaiveBayesInterface v0.1.6 [5ae90465] MLJScikitLearnInterface v0.6.1 [636a865e] NearestNeighborModels v0.2.3 [8b842266] PalmerPenguins v0.1.4 ⌃ [91a5bcdd] Plots v1.40.1 [8162dcfd] PrettyPrint v0.2.0 [ce6b1742] RDatasets v0.7.7 [321657f4] ScientificTypes v3.0.2 [8e980c4a] Shapefile v0.12.0 [de6bee2f] SimpleChains v0.4.6 [860ef19b] StableRNGs v1.0.1 [2913bbd2] StatsBase v0.34.2 [f3b207a7] StatsPlots v0.15.7 [40c74d1a] TableView v0.7.2 [fdbf4ff8] XLSX v0.10.1 [37e2e46d] LinearAlgebra [9a3f8284] Random Info Packages marked with ⌃ have new versions available and may be upgradable.
# Loading Necessary Packages
using CSV, DataFrames, Shapefile
using CategoricalArrays, FreqTables
using Plots, StatsPlots, Statistics
using LinearAlgebra, StatsBase, HypothesisTests
using Distributions, Random, StableRNGs
using PalmerPenguins, RDatasets
using MLJ, NearestNeighborModels, MLJScikitLearnInterface, MLJMultivariateStatsInterface
using MLJDecisionTreeInterface, MLJLinearModels
using CatBoost
InitError: Python: ModuleNotFoundError: No module named 'catboost'
Python stacktrace: none
during initialization of module CatBoost
Stacktrace:
[1] pythrow()
@ PythonCall ~/.julia/packages/PythonCall/wXfah/src/err.jl:94
[2] errcheck
@ PythonCall ~/.julia/packages/PythonCall/wXfah/src/err.jl:10 [inlined]
[3] pyimport(m::String)
@ PythonCall ~/.julia/packages/PythonCall/wXfah/src/concrete/import.jl:11
[4] __init__()
@ CatBoost ~/.julia/packages/CatBoost/TiqIz/src/CatBoost.jl:16
[5] run_module_init(mod::Module, i::Int64)
@ Base ./loading.jl:1128
[6] register_restored_modules(sv::Core.SimpleVector, pkg::Base.PkgId, path::String)
@ Base ./loading.jl:1116
[7] _include_from_serialized(pkg::Base.PkgId, path::String, ocachepath::String, depmods::Vector{Any})
@ Base ./loading.jl:1061
[8] _require_search_from_serialized(pkg::Base.PkgId, sourcepath::String, build_id::UInt128)
@ Base ./loading.jl:1575
[9] _require(pkg::Base.PkgId, env::String)
@ Base ./loading.jl:1932
[10] __require_prelocked(uuidkey::Base.PkgId, env::String)
@ Base ./loading.jl:1806
[11] #invoke_in_world#3
@ Base ./essentials.jl:921 [inlined]
[12] invoke_in_world
@ Base ./essentials.jl:918 [inlined]
[13] _require_prelocked(uuidkey::Base.PkgId, env::String)
@ Base ./loading.jl:1797
[14] macro expansion
@ Base ./loading.jl:1784 [inlined]
[15] macro expansion
@ Base ./lock.jl:267 [inlined]
[16] __require(into::Module, mod::Symbol)
@ Base ./loading.jl:1747
[17] #invoke_in_world#3
@ Base ./essentials.jl:921 [inlined]
[18] invoke_in_world
@ Base ./essentials.jl:918 [inlined]
[19] require(into::Module, mod::Symbol)
@ Base ./loading.jl:1740
[20] eval
@ ./boot.jl:385 [inlined]
[21] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:2070
[22] #invokelatest#2
@ ./essentials.jl:887 [inlined]
[23] invokelatest
@ ./essentials.jl:884 [inlined]
[24] (::VSCodeServer.var"#214#215"{VSCodeServer.NotebookRunCellArguments, String})()
@ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/serve_notebook.jl:19
[25] withpath(f::VSCodeServer.var"#214#215"{VSCodeServer.NotebookRunCellArguments, String}, path::String)
@ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/repl.jl:274
[26] notebook_runcell_request(conn::VSCodeServer.JSONRPC.JSONRPCEndpoint{Base.PipeEndpoint, Base.PipeEndpoint}, params::VSCodeServer.NotebookRunCellArguments)
@ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/serve_notebook.jl:13
[27] dispatch_msg(x::VSCodeServer.JSONRPC.JSONRPCEndpoint{Base.PipeEndpoint, Base.PipeEndpoint}, dispatcher::VSCodeServer.JSONRPC.MsgDispatcher, msg::Dict{String, Any})
@ VSCodeServer.JSONRPC ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/JSONRPC/src/typed.jl:67
[28] serve_notebook(pipename::String, outputchannel_logger::Base.CoreLogging.SimpleLogger; crashreporting_pipename::String)
@ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/packages/VSCodeServer/src/serve_notebook.jl:139
[29] top-level scope
@ ~/.vscode/extensions/julialang.language-julia-1.75.2/scripts/notebook/notebook.jl:35
About this dataset and our eventual goal:
# Read the CSV file into a DataFrame
# NOTE: the path is relative to the notebook's working directory.
loan_data = CSV.read("Data/loan_data.csv", DataFrame)
# Display first few rows
first(loan_data, 10)
| Row | Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| String15 | String7? | String3 | String3? | String15 | String3? | Int64 | Float64 | Float64 | Float64? | Float64? | String15 | String1 | |
| 1 | LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
| 2 | LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
| 3 | LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
| 4 | LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
| 5 | LP001013 | Male | Yes | 0 | Not Graduate | No | 2333 | 1516.0 | 95.0 | 360.0 | 1.0 | Urban | Y |
| 6 | LP001024 | Male | Yes | 2 | Graduate | No | 3200 | 700.0 | 70.0 | 360.0 | 1.0 | Urban | Y |
| 7 | LP001027 | Male | Yes | 2 | Graduate | missing | 2500 | 1840.0 | 109.0 | 360.0 | 1.0 | Urban | Y |
| 8 | LP001029 | Male | No | 0 | Graduate | No | 1853 | 2840.0 | 114.0 | 360.0 | 1.0 | Rural | N |
| 9 | LP001030 | Male | Yes | 2 | Graduate | No | 1299 | 1086.0 | 17.0 | 120.0 | 1.0 | Urban | Y |
| 10 | LP001032 | Male | No | 0 | Graduate | No | 4950 | 0.0 | 125.0 | 360.0 | 1.0 | Urban | Y |
# List the 13 column names of the loan dataset.
names(loan_data)
13-element Vector{String}:
"Loan_ID"
"Gender"
"Married"
"Dependents"
"Education"
"Self_Employed"
"ApplicantIncome"
"CoapplicantIncome"
"LoanAmount"
"Loan_Amount_Term"
"Credit_History"
"Property_Area"
"Loan_Status"
# Get an overview of the Data: per-column mean/min/median/max,
# missing-value counts, and element types.
describe(loan_data)
| Row | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| Symbol | Union… | Any | Union… | Any | Int64 | Type | |
| 1 | Loan_ID | LP001003 | LP002990 | 0 | String15 | ||
| 2 | Gender | Female | Male | 5 | Union{Missing, String7} | ||
| 3 | Married | No | Yes | 0 | String3 | ||
| 4 | Dependents | 0 | 3+ | 8 | Union{Missing, String3} | ||
| 5 | Education | Graduate | Not Graduate | 0 | String15 | ||
| 6 | Self_Employed | No | Yes | 21 | Union{Missing, String3} | ||
| 7 | ApplicantIncome | 3579.85 | 150 | 3333.0 | 9703 | 0 | Int64 |
| 8 | CoapplicantIncome | 1277.28 | 0.0 | 983.0 | 33837.0 | 0 | Float64 |
| 9 | LoanAmount | 104.987 | 9.0 | 110.0 | 150.0 | 0 | Float64 |
| 10 | Loan_Amount_Term | 340.865 | 12.0 | 360.0 | 480.0 | 11 | Union{Missing, Float64} |
| 11 | Credit_History | 0.837607 | 0.0 | 1.0 | 1.0 | 30 | Union{Missing, Float64} |
| 12 | Property_Area | Rural | Urban | 0 | String15 | ||
| 13 | Loan_Status | N | Y | 0 | String1 |
# Check the size of the DataFrame as (rows, columns).
size(loan_data)
(381, 13)
A couple of points to note here:
Ways of dealing with missing data:
# Remove rows with any missing values
# (381 -> 308 rows: 73 rows contained at least one missing entry).
clean_data = dropmissing(loan_data)
# Check size again after cleaning
size(clean_data)
(308, 13)
# Check for missing values — confirm that dropmissing left
# no missing entries in any column.
foreach(names(clean_data)) do col
    missing_count = count(ismissing, clean_data[!, col])
    println("Column: $col has $missing_count missing values")
end
Column: Loan_ID has 0 missing values Column: Gender has 0 missing values Column: Married has 0 missing values Column: Dependents has 0 missing values Column: Education has 0 missing values Column: Self_Employed has 0 missing values Column: ApplicantIncome has 0 missing values Column: CoapplicantIncome has 0 missing values Column: LoanAmount has 0 missing values Column: Loan_Amount_Term has 0 missing values Column: Credit_History has 0 missing values Column: Property_Area has 0 missing values Column: Loan_Status has 0 missing values
After cleaning the data, it also appears that some columns are stored as strings; let us encode these columns numerically for easier analysis in Julia.
# Copy clean_data to a new DataFrame called real_data to preserve the original data
real_data = copy(clean_data)

# Encode the yes/no style string columns as 0/1 indicators
# (Int.(…) of a Bool vector yields the same 0/1 Int64 column as ifelse.).
real_data[!, :Loan_Status] = Int.(real_data[!, :Loan_Status] .== "Y")
real_data[!, :Gender] = Int.(real_data[!, :Gender] .== "Female")
real_data[!, :Married] = Int.(real_data[!, :Married] .== "Yes")
real_data[!, :Education] = Int.(real_data[!, :Education] .== "Graduate")
real_data[!, :Self_Employed] = Int.(real_data[!, :Self_Employed] .== "Yes")

# Ordinal encoding for Property_Area: Urban -> 2, Semiurban -> 1, Rural -> 0.
real_data[!, :Property_Area] =
    [area == "Urban" ? 2 : area == "Semiurban" ? 1 : 0 for area in real_data[!, :Property_Area]]

# Cap "3+" dependents at 3, then parse the column to integers.
real_data[!, :Dependents] = parse.(Int, replace.(real_data[!, :Dependents], "3+" => "3"))

describe(real_data)
| Row | variable | mean | min | median | max | nmissing | eltype |
|---|---|---|---|---|---|---|---|
| Symbol | Union… | Any | Union… | Any | Int64 | DataType | |
| 1 | Loan_ID | LP001003 | LP002990 | 0 | String15 | ||
| 2 | Gender | 0.204545 | 0 | 0.0 | 1 | 0 | Int64 |
| 3 | Married | 0.600649 | 0 | 1.0 | 1 | 0 | Int64 |
| 4 | Dependents | 0.678571 | 0 | 0.0 | 3 | 0 | Int64 |
| 5 | Education | 0.743506 | 0 | 1.0 | 1 | 0 | Int64 |
| 6 | Self_Employed | 0.0909091 | 0 | 0.0 | 1 | 0 | Int64 |
| 7 | ApplicantIncome | 3599.13 | 150 | 3329.5 | 9703 | 0 | Int64 |
| 8 | CoapplicantIncome | 1278.43 | 0.0 | 871.5 | 33837.0 | 0 | Float64 |
| 9 | LoanAmount | 104.623 | 9.0 | 110.0 | 150.0 | 0 | Float64 |
| 10 | Loan_Amount_Term | 341.182 | 36.0 | 360.0 | 480.0 | 0 | Float64 |
| 11 | Credit_History | 0.853896 | 0.0 | 1.0 | 1.0 | 0 | Float64 |
| 12 | Property_Area | 1.04221 | 0 | 1.0 | 2 | 0 | Int64 |
| 13 | Loan_Status | 0.711039 | 0 | 1.0 | 1 | 0 | Int64 |
Now it is time for the visualization. Below are graphs covering all of the variables. Please feel free to take a look at our basic data visualization.
# 1. Tally the approvals/denials and draw them as labelled bars.
loan_status_counts = combine(groupby(clean_data, :Loan_Status), nrow => :count)

bar_plot = bar(
    loan_status_counts.Loan_Status,
    loan_status_counts.count,
    xlabel = "Loan Status",
    ylabel = "Count",
    title = "Distribution of Loan Status",
    legend = false,
)

# Stamp each bar with its exact count just above the bar top.
for (status, n) in zip(loan_status_counts.Loan_Status, loan_status_counts.count)
    annotate!(bar_plot, [(status, n, text(string(n), 8, :center, :bottom))])
end

# Printing Output
bar_plot
# 2. Applicant Income By Loan Status (0 = denied, 1 = approved)
boxplot(real_data[!, :Loan_Status], real_data[!, :ApplicantIncome],
title = "Applicant Income by Loan Status",
ylabel = "Applicant Income",
legend = false
)
# 3. Gender Histogram (0 = Male, 1 = Female)
histogram(real_data[!, :Gender], bins = 2, title = "Gender Distribution", legend = false)
# 4. Married Histogram (0 = No, 1 = Yes)
histogram(real_data[!, :Married], bins = 2, title = "Married Status Distribution", legend = false)
# 5. Dependents Histogram (integer 0–3; "3+" was recoded to 3 earlier)
histogram(real_data[!, :Dependents], title = "Dependents Distribution", legend = false)
# 6. Education Histogram (0 = Not Graduate, 1 = Graduate)
histogram(real_data[!, :Education], bins = 2, title = "Education Distribution", legend = false)
# 7. Self_Employed Histogram (0 = No, 1 = Yes)
histogram(real_data[!, :Self_Employed], bins = 2, title = "Self Employed Distribution", legend = false)
# 8. ApplicantIncome Histogram
histogram(real_data[!, :ApplicantIncome], title = "Applicant Income Distribution", xlabel = "Income", ylabel = "Frequency", legend = false)
# 9. CoapplicantIncome Histogram
histogram(real_data[!, :CoapplicantIncome], title = "Co-Applicant Income Distribution", xlabel = "Income", ylabel = "Frequency", legend = false)
# 10. LoanAmount Histogram
loan_amount_hist = histogram(real_data[!, :LoanAmount], title = "Loan Amount Distribution", xlabel = "Amount", ylabel = "Frequency", legend = false)
# 11. Loan_Amount_Term Histogram
loan_amount_term_hist = histogram(real_data[!, :Loan_Amount_Term], title = "Loan Amount Term Distribution", xlabel = "Term", ylabel = "Frequency", legend = false)
# 12. Credit History Histogram (0/1 indicator)
credit_history_hist = histogram(real_data[!, :Credit_History], bins = 2, title = "Credit History Distribution", legend = false)
# 13. Property Area Histogram (0 = Rural, 1 = Semiurban, 2 = Urban)
property_area_hist = histogram(real_data[!, :Property_Area], title = "Property Area Distribution", xlabel = "Area", ylabel = "Frequency", legend = false)
# 14. Property_Area vs LoanAmount dot plot, grouped (colored) by area code
xs = real_data[!, :Property_Area]
ys = real_data[!, :LoanAmount]
dotplot(xs, ys,
title = "Property Area and Loan Amount Dot Plot",
xlabel = "Property Area",
ylabel = "Loan Amount",
group = xs,
)
# 15. Property_Area vs Loan_Amount_Term violin plot, grouped by area code
xs = real_data[!, :Property_Area]
ys = real_data[!, :Loan_Amount_Term]
violin(xs, ys,
title = "Property and Loan Amount Term Violin Plot",
xlabel = "Property Area",
ylabel = "Loan_Amount_Term",
group = xs,
)
# 16. Correlation Plot Analysis part 1: pairwise scatter/correlation grid for
# the numerically-encoded categorical predictors plus the target.
selected_cols = [
:Gender,
:Married,
:Education,
:Dependents,
:Self_Employed,
:Property_Area,
:Loan_Status
]
@df real_data corrplot(cols(selected_cols),
size = (1000, 1000),
bins = 32,
title = "Correlation Plot",
xlabel = "Variable",
ylabel = "Variable",
labelfontsize = 8,
tickfontsize = 5,
linewidth = 2,
markersize = 10,
clim = (-1, 1),
colorbar_title = "Correlation Coefficient"
)
# 17. Correlation Plot Analysis part 2: same grid for the continuous /
# loan-term predictors plus the target.
selected_cols = [
:ApplicantIncome,
:LoanAmount,
:Loan_Amount_Term,
:Credit_History,
:Loan_Status
]
@df real_data corrplot(cols(selected_cols),
size = (1000, 1000),
bins = 32,
title = "Correlation Plot",
xlabel = "Variable",
ylabel = "Variable",
labelfontsize = 8,
tickfontsize = 5,
linewidth = 2,
markersize = 10,
clim = (-1, 1),
colorbar_title = "Correlation Coefficient"
)
# 18. Correlation Matrix for All Variables (column 1, Loan_ID, excluded).
M = cor(Matrix(real_data[!, 2:13]))
vars = names(real_data)[2:13]
# M is a 12×12 matrix indexed 1:12, so the ticks must run over the matrix axes —
# the original `(2:13, vars)` shifted every label one cell off its row/column.
fig = heatmap(M,
    title = "Correlation Matrix",
    clims = (-1, 1),
    xticks = (1:size(M, 2), vars),
    yticks = (1:size(M, 1), vars),
    color = cgrad(:balance, rev = true),
    xrot = 45,
    # `aspect = :ratio` is not a recognized Plots.jl attribute;
    # `aspect_ratio = :equal` is the supported way to get square cells.
    aspect_ratio = :equal,
    size = (700, 600),
)
# Overlay each cell with its rounded coefficient.
for j in axes(M, 2), i in axes(M, 1)
    annotate!(i, j, text("$(round(M[i,j], digits = 2))", :white, 12))
end
fig
# Drop the Loan_ID identifier column in place; it is a unique key with no predictive value.
select!(real_data, Not(:Loan_ID))
| Row | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Int64 | Int64 | Int64 | Int64 | Int64 | Int64 | Float64 | Float64 | Float64 | Float64 | Int64 | Int64 | |
| 1 | 0 | 1 | 1 | 1 | 0 | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | 0 | 0 |
| 2 | 0 | 1 | 0 | 1 | 1 | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | 2 | 1 |
| 3 | 0 | 1 | 0 | 0 | 0 | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | 2 | 1 |
| 4 | 0 | 0 | 0 | 1 | 0 | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | 2 | 1 |
| 5 | 0 | 1 | 0 | 0 | 0 | 2333 | 1516.0 | 95.0 | 360.0 | 1.0 | 2 | 1 |
| 6 | 0 | 1 | 2 | 1 | 0 | 3200 | 700.0 | 70.0 | 360.0 | 1.0 | 2 | 1 |
| 7 | 0 | 0 | 0 | 1 | 0 | 1853 | 2840.0 | 114.0 | 360.0 | 1.0 | 0 | 0 |
| 8 | 0 | 1 | 2 | 1 | 0 | 1299 | 1086.0 | 17.0 | 120.0 | 1.0 | 2 | 1 |
| 9 | 0 | 0 | 0 | 1 | 0 | 4950 | 0.0 | 125.0 | 360.0 | 1.0 | 2 | 1 |
| 10 | 1 | 0 | 0 | 1 | 0 | 3510 | 0.0 | 76.0 | 360.0 | 0.0 | 2 | 0 |
| 11 | 0 | 1 | 0 | 0 | 0 | 4887 | 0.0 | 133.0 | 360.0 | 1.0 | 0 | 0 |
| 12 | 0 | 1 | 0 | 0 | 0 | 7660 | 0.0 | 104.0 | 360.0 | 0.0 | 2 | 0 |
| 13 | 0 | 1 | 0 | 0 | 0 | 2600 | 1911.0 | 116.0 | 360.0 | 0.0 | 1 | 0 |
| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
| 297 | 0 | 1 | 1 | 1 | 0 | 2787 | 1917.0 | 146.0 | 360.0 | 0.0 | 0 | 0 |
| 298 | 0 | 1 | 0 | 1 | 0 | 2297 | 1522.0 | 104.0 | 360.0 | 1.0 | 2 | 1 |
| 299 | 1 | 0 | 0 | 0 | 0 | 2165 | 0.0 | 70.0 | 360.0 | 1.0 | 1 | 1 |
| 300 | 0 | 1 | 2 | 1 | 1 | 2726 | 0.0 | 106.0 | 360.0 | 0.0 | 1 | 0 |
| 301 | 0 | 1 | 0 | 1 | 0 | 3000 | 3416.0 | 56.0 | 180.0 | 1.0 | 1 | 1 |
| 302 | 0 | 1 | 0 | 1 | 0 | 3859 | 3300.0 | 142.0 | 180.0 | 1.0 | 0 | 1 |
| 303 | 0 | 0 | 0 | 0 | 0 | 3833 | 0.0 | 110.0 | 360.0 | 1.0 | 0 | 1 |
| 304 | 0 | 1 | 3 | 1 | 0 | 5703 | 0.0 | 128.0 | 360.0 | 1.0 | 2 | 1 |
| 305 | 0 | 1 | 0 | 1 | 0 | 3232 | 1950.0 | 108.0 | 360.0 | 1.0 | 0 | 1 |
| 306 | 1 | 0 | 0 | 1 | 0 | 2900 | 0.0 | 71.0 | 360.0 | 1.0 | 0 | 1 |
| 307 | 0 | 1 | 3 | 1 | 0 | 4106 | 0.0 | 40.0 | 180.0 | 1.0 | 0 | 1 |
| 308 | 1 | 0 | 0 | 1 | 1 | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | 1 | 0 |
# Fit a one-hot encoding scheme on real_data.
# NOTE(review): all columns are already numeric at this point, so the encoder
# appears to leave the table unchanged (the schema output still shows 12 columns).
hot = MLJ.fit!(machine(OneHotEncoder(), real_data))
# apply the dummy coding scheme; note that we qualify `transform`
data_hot_encoded = MLJ.transform(hot, real_data)
# check
schema(data_hot_encoded)
┌ Info: Training machine(OneHotEncoder(features = Symbol[], …), …). └ @ MLJBase /Users/nathanielzhu/.julia/packages/MLJBase/mIaqI/src/machines.jl:493
┌───────────────────┬────────────┬─────────┐ │ names │ scitypes │ types │ ├───────────────────┼────────────┼─────────┤ │ Gender │ Count │ Int64 │ │ Married │ Count │ Int64 │ │ Dependents │ Count │ Int64 │ │ Education │ Count │ Int64 │ │ Self_Employed │ Count │ Int64 │ │ ApplicantIncome │ Count │ Int64 │ │ CoapplicantIncome │ Continuous │ Float64 │ │ LoanAmount │ Continuous │ Float64 │ │ Loan_Amount_Term │ Continuous │ Float64 │ │ Credit_History │ Continuous │ Float64 │ │ Property_Area │ Count │ Int64 │ │ Loan_Status │ Count │ Int64 │ └───────────────────┴────────────┴─────────┘
# Reproducible 70/30 row split of the encoded table into two DataFrames.
# NOTE(review): `train`/`test` are rebound to index vectors a few cells later,
# so this DataFrame-valued split is effectively unused downstream.
rng = StableRNG(1997)
train, test = partition(data_hot_encoded, 0.7; rng = rng, shuffle = true)
(216×12 DataFrame Row │ Gender Married Dependents Education Self_Employed ApplicantIncome ⋯ │ Int64 Int64 Int64 Int64 Int64 Int64 ⋯ ─────┼────────────────────────────────────────────────────────────────────────── 1 │ 0 0 0 0 0 2333 ⋯ 2 │ 1 1 1 1 0 4608 3 │ 0 0 0 1 1 7167 4 │ 1 0 0 1 0 3159 5 │ 1 0 1 0 0 4606 ⋯ 6 │ 0 1 0 1 0 3232 7 │ 0 1 2 1 0 5935 8 │ 1 0 0 0 0 3400 ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ 210 │ 0 1 3 1 0 3400 ⋯ 211 │ 0 1 0 0 0 3814 212 │ 0 1 2 1 0 4400 213 │ 0 1 0 1 0 5488 214 │ 1 0 0 1 0 1811 ⋯ 215 │ 0 1 1 0 0 3500 216 │ 1 0 0 1 0 5000 6 columns and 201 rows omitted, 92×12 DataFrame Row │ Gender Married Dependents Education Self_Employed ApplicantIncome ⋯ │ Int64 Int64 Int64 Int64 Int64 Int64 ⋯ ─────┼────────────────────────────────────────────────────────────────────────── 1 │ 0 0 0 1 0 6277 ⋯ 2 │ 0 1 3 0 1 7100 3 │ 0 0 0 1 0 2500 4 │ 0 1 1 1 0 2750 5 │ 1 1 0 1 0 2484 ⋯ 6 │ 0 1 0 1 0 3246 7 │ 0 0 0 1 0 6000 8 │ 0 1 0 1 0 2958 ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ 86 │ 1 0 0 1 0 3762 ⋯ 87 │ 0 1 0 1 0 4860 88 │ 0 1 0 1 0 3597 89 │ 1 0 1 1 1 8624 90 │ 0 1 1 1 0 2882 ⋯ 91 │ 1 1 0 0 1 7142 92 │ 0 1 3 1 0 8750 6 columns and 77 rows omitted)
# Target as a categorical vector with explicit levels: 0 (denied "N") / 1 (approved "Y").
y = categorical((real_data[!, :Loan_Status]), levels = [0,1])
# Feature table: every column except the target.
X = select(real_data, Not([:Loan_Status]))
levels(y)
2-element Vector{Int64}:
0
1
# 80/20 split of row indices (rebinds the `train`/`test` names from the earlier cell).
train, test = partition(eachindex(y), 0.8, shuffle=true, rng=1997);
# checking best models for our work: list every registered MLJ model whose
# input/target scitypes are compatible with (X, y)
for m in models(matching(X, y))
println("""
[Model: $(m.name)]
\t prediction type: $(m.prediction_type)
\t source package: $(m.package_name)
""")
end
[Model: AdaBoostStumpClassifier] prediction type: probabilistic source package: DecisionTree [Model: CatBoostClassifier] prediction type: probabilistic source package: CatBoost [Model: ConstantClassifier] prediction type: probabilistic source package: MLJModels [Model: DecisionTreeClassifier] prediction type: probabilistic source package: BetaML [Model: DecisionTreeClassifier] prediction type: probabilistic source package: DecisionTree [Model: DeterministicConstantClassifier] prediction type: deterministic source package: MLJModels [Model: EvoTreeClassifier] prediction type: probabilistic source package: EvoTrees [Model: KernelPerceptronClassifier] prediction type: probabilistic source package: BetaML [Model: NeuralNetworkClassifier] prediction type: probabilistic source package: BetaML [Model: PegasosClassifier] prediction type: probabilistic source package: BetaML [Model: PerceptronClassifier] prediction type: probabilistic source package: BetaML [Model: RandomForestClassifier] prediction type: probabilistic source package: BetaML [Model: RandomForestClassifier] prediction type: probabilistic source package: DecisionTree [Model: RandomForestClassifier] prediction type: probabilistic source package: MLJScikitLearnInterface [Model: StableForestClassifier] prediction type: probabilistic source package: SIRUS [Model: StableRulesClassifier] prediction type: probabilistic source package: SIRUS
Here are all the models that would work in this project:
[Model: AdaBoostStumpClassifier] prediction type: probabilistic source package: DecisionTree
[Model: CatBoostClassifier] prediction type: probabilistic source package: CatBoost
[Model: ConstantClassifier] prediction type: probabilistic source package: MLJModels
[Model: DecisionTreeClassifier] prediction type: probabilistic source package: BetaML
[Model: DecisionTreeClassifier] prediction type: probabilistic source package: DecisionTree
[Model: DeterministicConstantClassifier] prediction type: deterministic source package: MLJModels
[Model: EvoTreeClassifier] prediction type: probabilistic source package: EvoTrees
[Model: KernelPerceptronClassifier] prediction type: probabilistic source package: BetaML
[Model: NeuralNetworkClassifier] prediction type: probabilistic source package: BetaML
[Model: PegasosClassifier] prediction type: probabilistic source package: BetaML
[Model: PerceptronClassifier] prediction type: probabilistic source package: BetaML
[Model: RandomForestClassifier] prediction type: probabilistic source package: BetaML
[Model: RandomForestClassifier] prediction type: probabilistic source package: DecisionTree
[Model: RandomForestClassifier] prediction type: probabilistic source package: MLJScikitLearnInterface
[Model: StableForestClassifier] prediction type: probabilistic source package: SIRUS
[Model: StableRulesClassifier] prediction type: probabilistic source package: SIRUS
# Load the model types from their providing packages
# (`verbosity = 0` suppresses the loading banners).
KNNClassifier = @load KNNClassifier verbosity = 0
LDA = @load LDA verbosity = 0
NeuralNetworkClassifier = @load NeuralNetworkClassifier pkg = MLJFlux verbosity = 0
MultinomialClassifier = @load MultinomialClassifier verbosity = 0
# Fix: `verbosity = 0` was passed twice on this line; once is enough.
CatBoostClassifier = @load CatBoostClassifier pkg = CatBoost verbosity = 0
RandomForestClassifier = @load RandomForestClassifier pkg = BetaML verbosity = 0
DecisionTreeClassifier = @load DecisionTreeClassifier pkg = DecisionTree verbosity = 0
# Candidate classifiers to compare; all use default hyperparameters unless noted.
model_list = [
KNNClassifier(K = 5), # use nearest 5-neighbors to make predictions
LDA(),
NeuralNetworkClassifier(),
MultinomialClassifier(),
CatBoostClassifier(),
RandomForestClassifier(),
DecisionTreeClassifier()
]
7-element Vector{Probabilistic}:
KNNClassifier(K = 5, …)
LDA(method = gevd, …)
NeuralNetworkClassifier(builder = Short(n_hidden = 0, …), …)
MultinomialClassifier(lambda = 2.220446049250313e-16, …)
CatBoostClassifier(iterations = 1000, …)
RandomForestClassifier(n_trees = 30, …)
DecisionTreeClassifier(max_depth = -1, …)
# Per-model metric accumulators, filled by the training loop below (one entry per model).
acc = Float64[] # accuracy()
pre = Float64[] # multiclass_precision()
rec = Float64[] # multiclass_recall()
f1s = Float64[] # f1score()
mat = [] # confusion_matrix() — untyped (Any) since entries are ConfusionMatrix objects
Any[]
# Train and evaluate every candidate with an identical preprocessing pipeline,
# pushing each test-set metric into the accumulators declared above.
for clf in model_list
# Create a pipeline model that standardizes, then fits a classifier.
model = Pipeline(Standardizer(), clf)
# Fit the model onto the training set (machine binds model to the full data;
# `rows = train` restricts fitting to the training indices).
mach = machine(model, X, y)
MLJ.fit!(mach, rows = train, verbosity = 0)
# Make predictions on the test set
yhat = MLJ.predict(mach, rows = test)
# Evaluate the model on the test set using selected metrics
#
# NOTES:
#
# - MLJ.predict() may give probabilistic predictions. Use mode() to collapse to a concrete target.
# - An evaluation metric F() always accepts inputs as F(fitted, observed).
push!(acc, accuracy(mode.(yhat), y[test]))
push!(pre, multiclass_precision(mode.(yhat), y[test]))
push!(rec, multiclass_recall(mode.(yhat), y[test]))
push!(f1s, f1score(mode.(yhat), y[test]))
push!(mat, ConfusionMatrix(levels = levels(y))(mode.(yhat), y[test]))
end
┌ Warning: Levels not explicitly ordered. Using the order CategoricalValue{Int64, UInt32}[0, 1]. The "positive" level is 1.
└ @ StatisticalMeasures.ConfusionMatrices /Users/nathanielzhu/.julia/packages/StatisticalMeasures/hPDX2/src/confusion_matrices.jl:339
┌ Warning: Levels not explicitly ordered. Using the order CategoricalValue{Int64, UInt32}[0, 1]. The "positive" level is 1.
└ @ StatisticalMeasures.ConfusionMatrices /Users/nathanielzhu/.julia/packages/StatisticalMeasures/hPDX2/src/confusion_matrices.jl:339
┌ Warning: Layer with Float32 parameters got Float64 input.
│ The input will be converted, but any earlier layers may be very slow.
│ layer = Dense(11 => 5, σ)
│ summary(x) = 11×1 Matrix{Float64}
└ @ Flux /Users/nathanielzhu/.julia/packages/Flux/vzwqj/src/layers/stateless.jl:60
┌ Warning: Levels not explicitly ordered. Using the order CategoricalValue{Int64, UInt32}[0, 1]. The "positive" level is 1.
└ @ StatisticalMeasures.ConfusionMatrices /Users/nathanielzhu/.julia/packages/StatisticalMeasures/hPDX2/src/confusion_matrices.jl:339
# Collect the per-model test-set metrics into a summary table.
# NOTE(review): the recorded output shows an UndefVarError for `model_list` here,
# which suggests this cell ran after a kernel restart — re-run the cells above first.
results = DataFrame(
Model = typeof.(model_list),
Accuracy = acc,
Precision = pre,
Recall = rec,
F1 = f1s
)
UndefVarError: `model_list` not defined Stacktrace: [1] top-level scope @ ~/Desktop/UCR/STAT206/Coding/Project/Project_Code.ipynb:1
# Inspect the confusion matrix of the second model in model_list (LDA) on the test set.
@show mat[2]
# Work on a fresh copy so real_data keeps its purely numeric columns.
glm_df = DataFrame(copy(real_data))

# GLM treats CategoricalArray columns as factors, so recode the discrete
# predictors (and the response) as categorical.
for col in (:Credit_History, :Property_Area, :Loan_Status, :Education, :Gender, :Self_Employed)
    glm_df[!, col] = CategoricalArray(glm_df[!, col])
end

# Define the formula for logistic regression
formula = @formula(Loan_Status ~ ApplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History + Property_Area)

# Fit the regression model: binomial family with a probit link.
# (An `lm` fit was tried previously and left commented out.)
# model = lm(@formula(Loan_Status ~ ApplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History + Property_Area), glm_df)
probit = glm(@formula(Loan_Status ~ ApplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History + Property_Area), glm_df, Binomial(), ProbitLink())
# Print the model summary.
# Fix: `println(model)` referenced `model`, which is only assigned in the
# commented-out `lm` line above; the fitted object in this cell is `probit`.
println(probit)